#import necessary python libraries
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.decomposition import PCA
from scipy.stats import zscore
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Load the dataset
vehicle_df = pd.read_csv("https://raw.githubusercontent.com/vanjimohan/aiml/master/vehicle-1-1.csv")
# Inspect the first few rows
vehicle_df.head()
# Count missing values per column
vehicle_df.isnull().sum()
# Treat empty strings as missing values.
# FIX: np.nan — the np.NaN alias was removed in NumPy 2.0.
vehicle_df.replace('', np.nan, inplace=True)
# Impute missing values with the column mean.
# FIX: numeric_only=True keeps the non-numeric 'class' target out of the mean
# computation — pandas >= 2.0 raises on object columns otherwise.
vehicle_df.fillna(vehicle_df.mean(numeric_only=True), inplace=True)
# Verify that no null values remain
vehicle_df.isnull().sum()
# Standardize the feature columns with z-scores so a single box-plot scale
# fits every column, then draw one box plot per column to spot outliers.
vehicle_scaled = vehicle_df.drop('class', axis=1).apply(zscore)
plt.figure(figsize=(20, 10))
plt.xticks(rotation='vertical')
plt.ylim(-2, 2)
# One box per column, placed at consecutive x positions
for idx, column in enumerate(vehicle_scaled.columns, start=1):
    plt.boxplot(vehicle_scaled[column], positions=[idx], labels=[column])
# Removing outliers.
# FIX: the original filter (vehicle_scaled < 3) only trimmed the positive
# tail — a row with z = -10 would survive. Filter on the absolute z-score so
# both tails beyond |z| >= 3 are dropped, and reuse one mask so the features
# and the target stay aligned.
print("Shape before removing Outliers: ", vehicle_scaled.shape)
outlier_mask = (vehicle_scaled.abs() < 3).all(axis=1)
vehicle_filtered = vehicle_scaled[outlier_mask]
print("Shape after removing Outliers: ", vehicle_filtered.shape)
print((vehicle_scaled.shape[0] - vehicle_filtered.shape[0]) ,"records removed due to outliers")
# Target restricted to the same surviving rows
y = vehicle_df[outlier_mask]['class']
print((vehicle_df.shape[0] - y.shape[0]) ,"records removed due to outliers")
# Pair plot of the outlier-free dataset
sns.pairplot(vehicle_filtered,diag_kind='kde')
The pair plot shows that many columns depend on one another, even though ideally these should all be independent variables. Compactness is positively correlated with circularity, distance-circularity, radius-ratio, scatter-ratio, pr_axis_rectangularity, max_length_rectangularity and scaled variance1. Likewise, several other pairs of variables are strongly correlated, either positively or negatively. Let's check the correlation matrix.
# Reusable helper for pruning a stacked correlation matrix (adapted from
# https://stackoverflow.com/questions/17778394/list-highest-correlation-pairs-from-a-large-correlation-matrix-in-pandas)
def get_redundant_pairs(df):
    """Return the lower-triangle (column, column) pairs of df's correlation
    matrix layout, diagonal included — the duplicates to drop after unstack().
    """
    names = list(df.columns)
    return {
        (names[row], names[col])
        for row in range(len(names))
        for col in range(row + 1)
    }
def get_top_abs_correlations(df, n=5):
    """Return the n largest absolute pairwise correlations of df's columns,
    strongest first, with self- and mirrored pairs removed.
    """
    stacked = df.corr().abs().unstack()
    # Drop the diagonal and the lower-triangle duplicates before ranking
    stacked = stacked.drop(labels=get_redundant_pairs(df))
    return stacked.sort_values(ascending=False).head(n)
# Exclude the target column before the correlation analysis
vehicle_df_target_drop = vehicle_df.drop(columns='class')
# Show the 15 strongest absolute pairwise correlations
get_top_abs_correlations(vehicle_df_target_drop, n=15)
There are many highly correlated variables. Exclude the columns whose correlation with another column is >= 0.9.
Based on this, the following columns can be excluded:
scaled_variance.1
pr.axis_rectangularity
elongatedness
max.length_rectangularity
scaled_radius_of_gyration
# Correlation heatmap.
# FIX: numeric_only=True excludes the non-numeric 'class' column — pandas
# >= 2.0 raises on object columns here (older versions dropped them silently).
corr = vehicle_df.corr(numeric_only=True)
plt.subplots(figsize=(20,15))
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, square=True,cmap ="YlGnBu",linewidths = 0.1)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');
# Drop the columns identified above as having correlation >= 0.9 with others.
# vehicle_excluded is the manually-reduced (non-PCA) dataset for modelling.
excludes = ['scaled_variance.1','pr.axis_rectangularity','elongatedness','max.length_rectangularity','scaled_radius_of_gyration']
vehicle_excluded = vehicle_filtered.drop(excludes,axis=1)
# Principal Component Analysis on the outlier-free, standardized data.
# Covariance matrix of the features (rowvar=False: each column is a variable)
covMatrix = np.cov(vehicle_filtered, rowvar=False)
covMatrix
# Keep all 18 components first, so the full eigenvalue spectrum is visible
pca = PCA(n_components=18)
pca.fit(vehicle_filtered)
print("Eigen values: ",np.around(pca.explained_variance_,decimals=4))
print(pca.explained_variance_ratio_)
# Bar chart: variance explained by each principal component
plt.figure(figsize=(20,10))
plt.grid(axis='both')
plt.bar(list(range(1,19)),pca.explained_variance_ratio_,alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.yticks(np.arange(0, 0.7, 0.05))
# FIX: np.arange's stop is exclusive — use 19 so component 18 gets a tick too
plt.xticks(np.arange(1, 19, 1))
# Cumulative variance vs number of components, with grid lines, to read off
# how many components retain more than 95% of the original variance.
plt.figure(figsize=(20,10))
plt.grid(axis='both')
plt.yticks(np.arange(0.5, 1, 0.05))
# FIX: same exclusive-stop off-by-one as above
plt.xticks(np.arange(1, 19, 1))
plt.step(list(range(1,19)),np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Sum of variation explained')
plt.xlabel('eigen Value')
Based on the plot, 7 components are required to capture more than 95% of the variance of the original data.
# Keep 7 components — enough for > 95% of the variance per the plot above
pcaReduced = PCA(n_components=7)
pcaReduced.fit(vehicle_filtered)
print(pcaReduced.components_)
print(pcaReduced.explained_variance_ratio_)
# Project the data onto the 7 retained components
XpcaReduced = pcaReduced.transform(vehicle_filtered)
sns.pairplot(pd.DataFrame(XpcaReduced))
# The pair plot shows no remaining relationship between the components.
# Reusable method to fit and score an SVC model
def doSVCModel(model, X):
    """Fit `model` on a 70/30 train/test split of X and print the test accuracy.

    NOTE(review): the target comes from the module-level `y` built during
    outlier removal — X must have the same row count as `y`.
    Returns the score (original printed only; callers ignoring the return
    are unaffected).
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)
    model.fit(X_train, y_train)
    # FIX: removed the unused y_predict local — score() predicts internally
    score = model.score(X_test, y_test)
    print("Score: ", score)
    return score
# Compare the two reduced feature sets using a default RBF SVC
print("Modelling with Original dataset by applying manual exclusions")
doSVCModel(SVC(gamma='auto'), vehicle_excluded)

print("Modelling with PCA reduction dataset")
doSVCModel(SVC(gamma='auto'), XpcaReduced)
By performing PCA the number of columns was reduced to 7, and the accuracy score dropped by 3.22%. However, this is only with respect to the data available to us; there is a chance the model will generalize better in production than one trained on the original features. Reducing the number of columns also saves storage and computational power.
# Grid search over C and kernel for an SVC
def svc_grid_Search(X, y):
    """Grid-search SVC hyperparameters with 5-fold cross-validation.

    Searches C in {0.01, 0.05, 0.5, 1} and kernel in {linear, rbf}.
    Returns the best parameter dict, e.g. {'C': 1, 'kernel': 'rbf'}.
    """
    param_grid = {
        'C': [0.01, 0.05, 0.5, 1],
        'kernel': ['linear', 'rbf'],
    }
    grid_search = GridSearchCV(SVC(gamma='auto'), param_grid, cv=5)
    grid_search.fit(X, y)
    # FIX: removed the bare `grid_search.best_params_` expression — it was a no-op
    return grid_search.best_params_
# Run the grid search on the PCA-reduced data and report the winning parameters
result = svc_grid_Search(XpcaReduced, y)
print("Best Parameters: ", result)
# Refit an SVC configured with the best parameters found above
doSVCModel(SVC(gamma='auto', C=result['C'], kernel=result['kernel']), XpcaReduced)
The accuracy score is unchanged because SVC's defaults (C=1 and kernel='rbf') turned out to be the best parameters found by the grid search.